Compound Events¶
In [1]:
# Imports
import os
import pandas as pd
import warnings
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import operator
import seaborn as sns
from collections import defaultdict
# Silence library warnings for a cleaner notebook
warnings.filterwarnings('ignore')

# Global matplotlib defaults for every figure in this notebook
plt.rcParams.update({
    'figure.figsize': (15, 4),
    'figure.dpi': 600,
    'font.size': 10,
    'figure.titlesize': 15,
    'axes.linewidth': 0.1,
    'patch.linewidth': 0,
    'grid.linewidth': 0.1,
})
Initialization¶
In [2]:
# Input files follow the pattern: CENTER_var_scenario_stat.csv
path = center = 'LARC'  # data directory name doubles as the NASA center name

# Season of interest: June, July, August
months = [6, 7, 8]
month_title = '(JJA)'

# Temporal resolution of the input files to use
temporal_res = ['daily', 'monthly_avg', 'annual_avg'][0]

# Variable prefixes to load from the data directory
variables = ['pr_', 'tasmax_']

# Historical baseline period (inclusive year range)
hist = (1961, 1990)

# All NASA centers available in the dataset
centers = ['AMES', 'GSFC', 'JPL', 'KSC', 'MSFC', 'MAF', 'GISS',
           'LARC', 'SSC', 'GRC', 'WFF', 'JSC', 'WSTF', 'AFRC']

# Threshold definition per variable: [comparison operator, quantile]
percentiles = {
    'pr': ['<', 0.5],       # below-median precipitation (dry)
    'tasmax': ['>', 0.9]    # above 90th-percentile max temperature (hot)
}

# Map operator symbols to their callable equivalents
comp_ops = {
    '<': operator.lt,
    '<=': operator.le,
    '>': operator.gt,
    '>=': operator.ge,
}
Preprocess Data¶
In [3]:
def get_files(path: str, center: str, variables: list = None):
    '''
    Returns the list of CSV file paths in a directory whose names contain
    the given center and, optionally, one of the given variable substrings.

    Args:
        path: the path to the data directory to search
        center: the NASA center name expected in the filename
        variables: optional list of variable substrings to filter files;
            if None, no variable filtering is applied (default: None)
    Returns:
        list[str]: full paths of the matching CSV files

    Note: the previous implementation raised TypeError when `variables`
    was left at its default of None; None now means "no filter".
    '''
    return [os.path.join(path, f) for f in os.listdir(path)
            if center in f and f.endswith('.csv')
            and (variables is None or any(v in f for v in variables))]
def preprocess_file(filename: str, is_hist: bool, percentiles:dict):
    '''
    Returns a preprocessed pandas DataFrame based on CSV file type (historical or SSP).

    Relies on module-level globals defined in the configuration cell:
    `hist` (baseline year range) and `months` (season month numbers).

    Args:
        filename: Path to the CSV file, named CENTER_var_scenario_stat.csv
        is_hist: Boolean indicating whether the file is historical (True) or SSP (False)
        percentiles: Mapping of variable name -> [operator symbol, quantile]
    Returns:
        - If historical: DataFrame with one column '{var}_{op}_{p}' holding each
          model's quantile threshold over the baseline period (indexed by model)
        - If SSP: tuple ('{var}_{scenario}', DataFrame restricted to the season
          months with '_{var}' appended to every model column)
    '''
    # 'DIR/CENTER_var_scenario_stat.csv' -> [..., 'var', 'scenario', 'stat']
    # name[1] is the variable, name[2] the scenario
    name = filename[:-4].split('_') # Extract variable name
    df = pd.read_csv(filename).rename(columns={'Unnamed: 0': 'date'})
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date')
    if name[1] in ['pr']:
        # Calculate monthly mean per year while keeping original df dimensions
        # (transform broadcasts the month mean back onto every daily row)
        df = df.groupby(df.index.strftime('%Y-%m')).transform('mean')
    # Convert Kelvin to Celsius for temperature data
    if name[1] in ['tasmax', 'tas']:
        df = df - 273.15
    if is_hist:
        # Calculate percentiles for each model over the baseline period
        op, p = percentiles[name[1]]
        df = df[(hist[0] <= df.index.year) & (df.index.year <= hist[1])]
        # Aggregate to monthly means before taking the quantile
        df = df.groupby(df.index.strftime('%Y-%m')).mean()
        return pd.DataFrame({f'{name[1]}_{op}_{p}': df.quantile(p)})
    else:
        # Keep only the season of interest for scenario data
        df = df[df.index.month.isin(months)]
        return ('_'.join(name[1:3]), df.add_suffix(f'_{name[1]}'))
def add_compound_flag(hist_df: pd.DataFrame, ssp_df: pd.DataFrame, percentiles: dict):
    '''
    Returns the SSP DataFrame with per-variable threshold flags and a compound flag.

    For every model (row of `hist_df`) and variable in `percentiles`, adds:
      - '{model}_{var}_{op}_{p}': bool, SSP value passes the historical threshold
      - '{model}_{var}_differity': SSP value minus the threshold
        ('differity' name kept for column-name compatibility)
      - '{model}_compound': bool, True where ALL variable flags hold at once

    Uses the module-level `comp_ops` mapping of operator symbols to callables.

    Args:
        hist_df: Historical thresholds, indexed by model, columns '{var}_{op}_{p}'
        ssp_df: SSP DataFrame with one '{model}_{var}' column per model/variable
        percentiles: Mapping of variable name -> [operator symbol, quantile]
    Returns:
        pd.DataFrame: ssp_df with the added columns
    '''
    for model in hist_df.index:
        flags = []
        for var, (op, p) in percentiles.items():
            # Historical threshold for this model/variable
            perc = hist_df[f'{var}_{op}_{p}'].loc[model]
            # Compare the Series directly (vectorized). The previous version
            # compared a plain Python list to the scalar, which only worked
            # via numpy's reflected scalar comparison and breaks if `perc`
            # is a plain float.
            ssp_df[f'{model}_{var}_{op}_{p}'] = comp_ops[op](ssp_df[f'{model}_{var}'], perc)
            flags.append(ssp_df[f'{model}_{var}_{op}_{p}'].to_numpy())
            # Signed distance from the threshold
            ssp_df[f'{model}_{var}_differity'] = ssp_df[f'{model}_{var}'] - perc
        # Compound event: every variable exceeds its threshold on the same day
        ssp_df[f'{model}_compound'] = np.all(flags, axis=0)
    return ssp_df
def process_data(files, percentiles):
    """
    Process historical and SSP climate data files.

    Args:
        files (dict): Dictionary containing 'historical' and 'ssp' file lists
        percentiles (dict): Dictionary of percentile thresholds for each variable

    Returns:
        tuple: (hist_df, ssp_dfs) where hist_df holds per-model historical
        thresholds and ssp_dfs maps each scenario name to its flagged DataFrame
    """
    # Build the historical thresholds table (one column per variable)
    hist_frames = [preprocess_file(f, True, percentiles) for f in files['historical']]
    hist_df = pd.concat(hist_frames, axis=1).dropna()

    # Preprocess every SSP file, keyed by 'var_scenario'
    ssp_frames = dict(preprocess_file(f, False, percentiles) for f in files['ssp'])

    # Group the frames by scenario ('pr_ssp126' -> 'ssp126', ...)
    by_scenario = defaultdict(list)
    for label, frame in ssp_frames.items():
        by_scenario[label.split('_')[-1]].append(frame)

    # Merge each scenario's variables side by side and flag compound events
    ssp_dfs = {scenario: add_compound_flag(hist_df, pd.concat(frames, axis=1), percentiles)
               for scenario, frames in by_scenario.items()}
    return hist_df, ssp_dfs
In [4]:
# Gather this center's files at the chosen temporal resolution,
# then split them into historical vs. SSP scenario files.
files = sorted([f for f in get_files(path, center, variables) if temporal_res in f])
files = {key: [f for f in files if key in f] for key in ['historical', 'ssp']}
# Double quotes inside the f-string keep this valid on Python < 3.12
# (PEP 701 quote reuse is 3.12+ only).
print(f"{len(files['historical']) + len(files['ssp'])} {center} files")
display(files)

hist_df, ssp_dfs = process_data(files, percentiles)

print(f'\nHistorical percentiles: {len(hist_df)} models used')
display(hist_df.head(2))

print(f'SSP {temporal_res.title()} Data')
print(ssp_dfs.keys())

print('\nAdded Compound Flag')
display(ssp_dfs['ssp245'].head(2))
8 LARC files
{'historical': ['LARC/LARC_pr_historical_daily.csv',
'LARC/LARC_tasmax_historical_daily.csv'],
'ssp': ['LARC/LARC_pr_ssp126_daily.csv',
'LARC/LARC_pr_ssp245_daily.csv',
'LARC/LARC_pr_ssp370_daily.csv',
'LARC/LARC_tasmax_ssp126_daily.csv',
'LARC/LARC_tasmax_ssp245_daily.csv',
'LARC/LARC_tasmax_ssp370_daily.csv']}
Historical percentiles: 21 models used
| pr_<_0.5 | tasmax_>_0.9 | |
|---|---|---|
| INM-CM4-8 | 0.000036 | 30.373966 |
| INM-CM5-0 | 0.000036 | 30.351364 |
SSP Daily Data dict_keys(['ssp126', 'ssp245', 'ssp370']) Added Compound Flag
| INM-CM4-8_pr | INM-CM5-0_pr | NorESM2-MM_pr | NorESM2-LM_pr | MIROC6_pr | GFDL-ESM4_pr | MIROC-ES2L_pr | GISS-E2-1-G_pr | FGOALS-g3_pr | MPI-ESM1-2-HR_pr | ... | CNRM-CM6-1_pr_<_0.5 | CNRM-CM6-1_pr_differity | CNRM-CM6-1_tasmax_>_0.9 | CNRM-CM6-1_tasmax_differity | CNRM-CM6-1_compound | KACE-1-0-G_pr_<_0.5 | KACE-1-0-G_pr_differity | KACE-1-0-G_tasmax_>_0.9 | KACE-1-0-G_tasmax_differity | KACE-1-0-G_compound | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||||||||||
| 2015-06-01 | 0.000047 | 0.000056 | 0.000038 | 0.000052 | 0.000034 | 0.000043 | 0.000047 | 0.000032 | 0.00003 | 0.000043 | ... | False | 0.000004 | False | -5.282323 | False | False | 0.000002 | True | 1.390501 | False |
| 2015-06-02 | 0.000047 | 0.000056 | 0.000038 | 0.000052 | 0.000034 | 0.000043 | 0.000047 | 0.000032 | 0.00003 | 0.000043 | ... | False | 0.000004 | False | -6.377073 | False | False | 0.000002 | True | 2.904082 | False |
2 rows × 148 columns
Check¶
In [5]:
# Sanity check: inspect one model's historical thresholds and the first
# few days flagged as compound events under SSP2-4.5.
m = 'INM-CM4-8'
model_cols = ssp_dfs['ssp245'].filter(regex=m)
display(hist_df.loc[m])
display(model_cols[model_cols[f'{m}_compound'] == True].head(5))
pr_<_0.5 0.000036 tasmax_>_0.9 30.373966 Name: INM-CM4-8, dtype: float64
| INM-CM4-8_pr | INM-CM4-8_tasmax | INM-CM4-8_pr_<_0.5 | INM-CM4-8_pr_differity | INM-CM4-8_tasmax_>_0.9 | INM-CM4-8_tasmax_differity | INM-CM4-8_compound | |
|---|---|---|---|---|---|---|---|
| date | |||||||
| 2017-07-06 | 0.000031 | 32.343103 | True | -0.000005 | True | 1.969137 | True |
| 2017-07-07 | 0.000031 | 32.779413 | True | -0.000005 | True | 2.405447 | True |
| 2017-07-08 | 0.000031 | 35.188318 | True | -0.000005 | True | 4.814352 | True |
| 2017-07-09 | 0.000031 | 36.040735 | True | -0.000005 | True | 5.666769 | True |
| 2017-07-10 | 0.000031 | 37.428766 | True | -0.000005 | True | 7.054800 | True |
Analysis¶
In [6]:
def plot_all(df_dict, title_var, show_median=False):
    '''
    Plot the cross-model mean (and optionally median) of each scenario as a
    KDE distribution (matplotlib/seaborn) and an interactive time series
    (plotly). Uses the module-level `month_title` in figure titles.

    Args:
        df_dict: Mapping of scenario name -> DataFrame (models as columns)
        title_var: Label for the plotted quantity (used in titles and axes)
        show_median: Also draw the cross-model median (default: False)
    '''
    # --- Distribution of the cross-model statistics ---
    for scenario, df in df_dict.items():
        sns.kdeplot(data=df.mean(axis=1), label=f'{scenario}_mean',
                    linewidth=0.5, alpha=0.7)
        if show_median:
            sns.kdeplot(data=df.median(axis=1), label=f'{scenario}_median',
                        linewidth=0.5, alpha=0.7)
    plt.title(f'{title_var} Per Year {month_title} Across CMIP6 Models')
    plt.xlabel(title_var)
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.show()

    # --- Interactive time series of the same statistics ---
    fig = go.Figure()
    for scenario, df in df_dict.items():
        fig.add_trace(go.Scatter(
            x=df.index, y=df.mean(axis=1), name=f'{scenario}_mean',
            line=dict(width=0.8), mode='lines', opacity=0.6))
        if show_median:
            fig.add_trace(go.Scatter(
                x=df.index, y=df.median(axis=1), name=f'{scenario}_median',
                line=dict(width=0.8), mode='lines', opacity=0.6))
    fig.update_layout(
        title=f'{title_var} Per Year {month_title} Across CMIP6 Models',
        xaxis_title='Year', yaxis_title=title_var,
        width=1000, height=300, margin=dict(l=20, r=20, t=30, b=20),
        paper_bgcolor='white', plot_bgcolor='white',
        xaxis=dict(showgrid=True, gridcolor='lightgrey', gridwidth=0.1),
        yaxis=dict(showgrid=True, gridcolor='lightgrey', gridwidth=0.1),
    )
    fig.show()
Compound¶
In [7]:
# Average number of compound (dry-hot, CDHE) days per year for each scenario
compound_days = {}
for m, df in ssp_dfs.items():
    # Count compound days per model within each month
    monthly = (df.filter(regex='_compound$')
                 .groupby(df.index.strftime('%Y-%m')).sum().reset_index())
    monthly.date = pd.to_datetime(monthly.date)
    # Average the monthly counts within each year
    # (set_index('date') instead would keep the (month, year) resolution)
    yearly = monthly.groupby(monthly.date.dt.year).mean().drop(columns='date')
    compound_days[m] = yearly
plot_all(compound_days, 'Average CDHE Days', show_median=False)